* fix_outlier_ppsr50_with_imp_flags -- examines outliers in trailer data;
* THIS IS ON the PPSR>0.50 group;
* Copied from fix_outlier.sas;

** 10/23/04;
** Modified 3/31/14 to match edit/impute flags from recovered data;


*options obs=5000;run;


/* Stack ppn, year and cm from the 1977-1997 CMF files into one WORK
   dataset, sorted by ppn and year.  cm is stored as plant_cm because,
   per the original note, the FHS dataset already has an industry-level
   variable called CM. */
data cmf7797 (rename = (cm = plant_cm));
  set
    cmf.cmf1977 (keep = ppn year cm)
    cmf.cmf1982 (keep = ppn year cm)
    cmf.cmf1987 (keep = ppn year cm)
    cmf.cmf1992 (keep = ppn year cm)
    cmf.cmf1997 (keep = ppn year cm)
  ;
run;

proc sort data=cmf7797;
  by ppn year;
run;


/*----------------------------------------------------------------------
  chk -- build the PPSR > 0.50 file for one product, with imputed
         values blanked out, restricted to the FHS final sample.

  Parameters
    d : product key.  WORK.<d> must already exist and contain pqs (it is
        renamed to my_pqs in place); the key is also the name suffix in
        dbflags.phy<d>_with_flags, fhspen50.all<d> and db50im.all<d>.
    i : not referenced anywhere in the macro body.  Callers pass a
        4-digit code (presumably an industry code -- TODO confirm).
    s : last period of good data; rows with year > s are dropped.

  Reads   : dbflags.phy<d>_with_flags, WORK.<d>, WORK.cmf7797,
            fhspen50.all<d>
  Writes  : db50im.all<d>, plus WORK datasets all, rej_mine,
            inall_not_in_cmf, all<d>, matched<d>, onlyinFHS_<d>.
  Caveat  : pqs is renamed inside WORK.<d> itself, so WORK.<d> has to be
            rebuilt before the macro is run again for the same product.

  All comments inside the macro are block comments.  Statement-style
  comments are tokenized inside a macro definition, so an unmatched
  apostrophe in one of them can swallow the code that follows.
----------------------------------------------------------------------*/
%macro chk(d,i,s);

       /* START BY ONLY KEEPING OBS WITH PHYSICAL DATA. */

       /* Read in product datasets with edit/impute flags for all main
          plant-level variables. */

       data all;
         set dbflags.phy&d._with_flags;
         if aphyflag>0;
         if ppsr1>0.50;
         mer=1;
         if year<=&s;
       run;

       /* MERGE IN PQS_IMP_MEAN VARIABLE (CONSTRUCTED FROM RECOVERED CMF
          PRODUCT TRAILERS).  This includes the PV_imp_mean variable for
          some industries. */

       /* Rename pqs first so it is not overwritten by the merge below.
          NOTE(review): only pqs is renamed.  pv from WORK.<d> keeps its
          name, so if the flags file also carries pv, the merge replaces
          it with the WORK.<d> value -- confirm this is intended. */
       proc datasets library=work;
         modify &d;
         rename pqs = my_pqs;
       run;
       quit;

       proc sort data=&d; by ppn year; run;
       proc sort data=all; by ppn year; run;

       /* Keep ppn-years present in both files; rows found only in
          WORK.<d> go to rej_mine. */
       data all rej_mine;
         merge all (in=inall) &d (in=inmine);
         by ppn year;
         if inall and inmine then output all;
         else if inmine then output rej_mine;
       run;

       /* Merge in plant-level cm variable from raw CMFs. */

       data all inall_not_in_cmf;
         merge all (in=inall) cmf7797 (in=incmf);
         by ppn year;
         if inall and incmf then output all;
         else if inall then output inall_not_in_cmf;  /* inall_not_in_cmf should have 0 obs */
       run;

       /* Identify PQS imputes using the FHS reverse-engineering method
          with different rounding criteria. */

       /* 1) IMPUTE DELETIONS --- replace imputes with missing values. */

       /* REPLACE WITH MISSING VALUES ANY IMPUTED VARIABLE
          THAT GETS EXPORTED TO THE CART SCRIPTS
          EXCEPT TE AND SW. */
       data all;
         set all;

         /* Plant-level variables: blank the value when its flag is 1. */
         if nww_f_imp = 1 then ww = .;
         if npw_f_imp = 1 then pw = .;
         if nph_f_imp = 1 then ph = .;
         if nee_f_imp = 1 then ee = .;
         if ncf_f_imp = 1 then cf = .;
         if ncm_f_imp = 1 then plant_cm = .;
         if ncp_f_imp = 1 then cp = .;
         if ntvs_f_imp = 1 then tvs = .;

         /* dinv is blanked when either inventory flag is set. */
         if ntib_f_imp = 1 or ntie_f_imp = 1 then dinv = .;

         /* pqs counts as imputed when pqs_imp_mean exceeds 0.50. */
         if pqs_imp_mean > 0.50 then do;
           pqs_imp = 1;
           pqs = .;
         end;

         /* pv counts as imputed when pv_imp_mean is above zero. */
         if pv_imp_mean > 0 then do;
           pv_imp = 1;
           pv = .;
         end;
       run;

       /* 4.1) MATCH TO FINAL FHS SAMPLE. */

       /* Read in PPN-years from the FHS final sample. */
       data all&d (keep = ppn year);
         set fhspen50.all&d;
       run;

       proc sort data=all&d; by ppn year; run;
       proc sort data=all; by ppn year; run;

       /* Split the FHS ppn-years into those matched in all and those
          found only in the FHS file.  The only-in-FHS dataset should be
          empty -- check this. */
       data onlyinFHS_&d matched&d;
         merge all&d (in=infhs) all (in=inall);
         by ppn year;
         if infhs=1 and inall=0 then output onlyinFHS_&d;
         else if infhs and inall then output matched&d;
       run;

       /* Include all plants in FHS final sample. */
       data all;
         set matched&d onlyinFHS_&d;
       run;

       proc sort data=all; by year; run;

       data db50im.all&d;
         set all;
       run;

%mend chk;

  
 /* Copy each yearly bredf file into WORK and stamp it with its year. */
 data bredf77;
   set fhs7797.bredf77;
   year = 1977;
 run;

 data bredf82;
   set fhs7797.bredf82;
   year = 1982;
 run;

 data bredf87;
   set fhs7797.bredf87;
   year = 1987;
 run;

 data bredf92;
   set fhs7797.bredf92;
   year = 1992;
 run;

 data bredf97;
   set fhs7797.bredf97;
   year = 1997;
 run;

 /* Pull just the PQS and PV impute information for merging with the
    rest of the data.
    For multi-product industries (like boxes), pqs_imp_mean is the
    plant-level imputation rate: the fraction of products from that
    plant with imputed PQS.  Likewise for pv_imp_mean.
    For single-product industries (like concrete), pqs_imp_mean and
    pv_imp_mean are 0 or 1 for a given plant.
    Here pv_imp_mean is set to 0 on every row. */
 data bredf;
   set bredf77 bredf82 bredf87 bredf92 bredf97;
   pv_imp_mean = 0;
   prod = "bred";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(bredf,2051,1997);



 /* Copy each yearly cofff file into WORK and stamp it with its year. */
 data cofff77;
   set fhs7797.cofff77;
   year = 1977;
 run;

 data cofff82;
   set fhs7797.cofff82;
   year = 1982;
 run;

 data cofff87;
   set fhs7797.cofff87;
   year = 1987;
 run;

 data cofff92;
   set fhs7797.cofff92;
   year = 1992;
 run;

 data cofff97;
   set fhs7797.cofff97;
   year = 1997;
 run;

 /* Stack the five years and keep only the impute-related columns.
    pv_imp_mean is not overridden here, so it is whatever the yearly
    files supply. */
 data cofff;
   set cofff77 cofff82 cofff87 cofff92 cofff97;
   prod = "coff";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(cofff,2095,1997);

 /* Copy each yearly icebf file into WORK and stamp it with its year. */
 data icebf77;
   set fhs7797.icebf77;
   year = 1977;
 run;

 data icebf82;
   set fhs7797.icebf82;
   year = 1982;
 run;

 data icebf87;
   set fhs7797.icebf87;
   year = 1987;
 run;

 data icebf92;
   set fhs7797.icebf92;
   year = 1992;
 run;

 data icebf97;
   set fhs7797.icebf97;
   year = 1997;
 run;

 /* Stack the five years and keep only the impute-related columns;
    pv_imp_mean is set to 0 on every row. */
 data icebf;
   set icebf77 icebf82 icebf87 icebf92 icebf97;
   pv_imp_mean = 0;
   prod = "iceb";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(icebf,2097,1997);

 /* Copy each yearly icepf file into WORK and stamp it with its year.
    Only 1977, 1982 and 1987 are used for this product. */
 data icepf77;
   set fhs7797.icepf77;
   year = 1977;
 run;

 data icepf82;
   set fhs7797.icepf82;
   year = 1982;
 run;

 data icepf87;
   set fhs7797.icepf87;
   year = 1987;
 run;

 /* Stack the three years and keep only the impute-related columns;
    pv_imp_mean is set to 0 on every row. */
 data icepf;
   set icepf77 icepf82 icepf87;
   pv_imp_mean = 0;
   prod = "icep";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(icepf,2097,1987);

 /* Copy each yearly floorf file into WORK under the shorter flrf name
    and stamp it with its year. */
 data flrf77;
   set fhs7797.floorf77;
   year = 1977;
 run;

 data flrf82;
   set fhs7797.floorf82;
   year = 1982;
 run;

 data flrf87;
   set fhs7797.floorf87;
   year = 1987;
 run;

 data flrf92;
   set fhs7797.floorf92;
   year = 1992;
 run;

 data flrf97;
   set fhs7797.floorf97;
   year = 1997;
 run;

 /* Stack the five years and keep only the impute-related columns.
    pv_imp_mean is not overridden here, so it is whatever the yearly
    files supply. */
 data flrf;
   set flrf77 flrf82 flrf87 flrf92 flrf97;
   prod = "floor";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(flrf,2426,1997);

 /* Copy each yearly plyf file into WORK and stamp it with its year. */
 data plyf77;
   set fhs7797.plyf77;
   year = 1977;
 run;

 data plyf82;
   set fhs7797.plyf82;
   year = 1982;
 run;

 data plyf87;
   set fhs7797.plyf87;
   year = 1987;
 run;

 data plyf92;
   set fhs7797.plyf92;
   year = 1992;
 run;

 data plyf97;
   set fhs7797.plyf97;
   year = 1997;
 run;

 /* Stack the five years and keep only the impute-related columns.
    pv_imp_mean is not overridden here, so it is whatever the yearly
    files supply. */
 data plyf;
   set plyf77 plyf82 plyf87 plyf92 plyf97;
   prod = "ply";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(plyf,2435,1997);

 /* Copy each yearly boxf file into WORK and stamp it with its year.
    Only 1977, 1982 and 1987 are used for this product. */
 data boxf77;
   set fhs7797.boxf77;
   year = 1977;
 run;

 data boxf82;
   set fhs7797.boxf82;
   year = 1982;
 run;

 data boxf87;
   set fhs7797.boxf87;
   year = 1987;
 run;

 /* Stack the three years and keep only the impute-related columns;
    pv_imp_mean is set to 0 on every row. */
 data boxf;
   set boxf77 boxf82 boxf87;
   pv_imp_mean = 0;
   prod = "boxes";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(boxf,2653,1987);

 /* Copy each yearly carbonf file into WORK and stamp it with its year. */
 data carbonf77;
   set fhs7797.carbonf77;
   year = 1977;
 run;

 data carbonf82;
   set fhs7797.carbonf82;
   year = 1982;
 run;

 data carbonf87;
   set fhs7797.carbonf87;
   year = 1987;
 run;

 data carbonf92;
   set fhs7797.carbonf92;
   year = 1992;
 run;

 data carbonf97;
   set fhs7797.carbonf97;
   year = 1997;
 run;

 /* Stack the five years and keep only the impute-related columns;
    pv_imp_mean is set to 0 on every row. */
 data carbonf;
   set carbonf77 carbonf82 carbonf87 carbonf92 carbonf97;
   pv_imp_mean = 0;
   prod = "carbon";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(carbonf,2895,1997);

 /* Copy each yearly gasf file into WORK and stamp it with its year. */
 data gasf77;
   set fhs7797.gasf77;
   year = 1977;
 run;

 data gasf82;
   set fhs7797.gasf82;
   year = 1982;
 run;

 data gasf87;
   set fhs7797.gasf87;
   year = 1987;
 run;

 data gasf92;
   set fhs7797.gasf92;
   year = 1992;
 run;

 data gasf97;
   set fhs7797.gasf97;
   year = 1997;
 run;

 /* Stack the five years and keep only the impute-related columns.
    pv_imp_mean is not overridden here, so it is whatever the yearly
    files supply. */
 data gasf;
   set gasf77 gasf82 gasf87 gasf92 gasf97;
   prod = "gas";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(gasf,2911,1997);

 /* Copy each yearly concf file into WORK and stamp it with its year.
    Only 1977 through 1992 are used for this product. */
 data concf77;
   set fhs7797.concf77;
   year = 1977;
 run;

 data concf82;
   set fhs7797.concf82;
   year = 1982;
 run;

 data concf87;
   set fhs7797.concf87;
   year = 1987;
 run;

 data concf92;
   set fhs7797.concf92;
   year = 1992;
 run;

 /* Stack the four years and keep only the impute-related columns;
    pv_imp_mean is set to 0 on every row. */
 data concf;
   set concf77 concf82 concf87 concf92;
   pv_imp_mean = 0;
   prod = "conc";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(concf,3273,1992);

 /* Copy each yearly sugf file into WORK and stamp it with its year. */
 data sugf77;
   set fhs7797.sugf77;
   year = 1977;
 run;

 data sugf82;
   set fhs7797.sugf82;
   year = 1982;
 run;

 data sugf87;
   set fhs7797.sugf87;
   year = 1987;
 run;

 data sugf92;
   set fhs7797.sugf92;
   year = 1992;
 run;

 data sugf97;
   set fhs7797.sugf97;
   year = 1997;
 run;

 /* No CART imputes are done for sugar, because there are so few PQS
    imputations.  Both pqs_imp_mean and pv_imp_mean are therefore set
    to 0 on every row, so that all of the FHS sample is used. */
 data sugf;
   set sugf77 sugf82 sugf87 sugf92 sugf97;
   pqs_imp_mean = 0;
   pv_imp_mean = 0;
   prod = "sugar";
   keep ppn year prod pqs_imp_mean pv_imp_mean pqs pv;
 run;

 %chk(sugf,2061,1997);
